Frame

setwd("~/Dropbox/github/art-data-science/notebook/onion")
library(rvest)
## Loading required package: xml2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(prophet)
## Loading required package: Rcpp
library(ggmap)
## 
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
## 
##     wind

Acquire

pg.out <- read_html("MonthWiseMarketArrivalsNew.htm")
pg.table <- pg.out %>%
            html_node("#dnn_ctr974_MonthWiseMarketArrivals_GridView1") %>%
            html_table()
str(pg.table)
## 'data.frame':    3784 obs. of  7 variables:
##  $ Market              : chr  "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
##  $ Month Name          : chr  "January" "January" "January" "February" ...
##  $ Year                : chr  "2014" "2015" "2017" "2014" ...
##  $ Arrival (q)         : int  440 1305 200 1115 1115 1300 920 670 1350 940 ...
##  $ Price Minimum (Rs/q): chr  "1025" "1309" "750" "831" ...
##  $ Price Maximum (Rs/q): chr  "1481" "1858" "1000" "1163" ...
##  $ Modal Price (Rs/q)  : chr  "1256" "1613" "850" "983" ...
df <- pg.table

Refine

dim(df)
## [1] 3784    7
column_names <- c('market', 'month', 'year', 'quantity', 'priceMin', 'priceMax', 'priceMod')
colnames(df) <- column_names
head(df)
##       market    month year quantity priceMin priceMax priceMod
## 1 ABOHAR(PB)  January 2014      440     1025     1481     1256
## 2 ABOHAR(PB)  January 2015     1305     1309     1858     1613
## 3 ABOHAR(PB)  January 2017      200      750     1000      850
## 4 ABOHAR(PB) February 2014     1115      831     1163      983
## 5 ABOHAR(PB) February 2015     1115     1200     1946     1688
## 6 ABOHAR(PB)    March 2014     1300      844     1181      994
tail(df)
##         market    month  year  quantity priceMin  priceMax  priceMod
## 3779 YEOLA(MS) November  2015     62522     1003      2460      1996
## 3780 YEOLA(MS) November  2016     45954      207       780       572
## 3781 YEOLA(MS) December  2014    201077      446      1654      1456
## 3782 YEOLA(MS) December  2015    223315      609      1446      1126
## 3783 YEOLA(MS) December  2016    214937      256       753       634
## 3784                    Total 268749947 841(Avg) 1544(Avg) 1249(Avg)
df <- df %>%
      filter(year != "Total")
dim(df)
## [1] 3783    7
df$quantity <- as.numeric(df$quantity)
df$year     <- as.numeric(df$year)
df$priceMin <- as.numeric(df$priceMin)
df$priceMax <- as.numeric(df$priceMax)
df$priceMod <- as.numeric(df$priceMod)
str(df)
## 'data.frame':    3783 obs. of  7 variables:
##  $ market  : chr  "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
##  $ month   : chr  "January" "January" "January" "February" ...
##  $ year    : num  2014 2015 2017 2014 2015 ...
##  $ quantity: num  440 1305 200 1115 1115 ...
##  $ priceMin: num  1025 1309 750 831 1200 ...
##  $ priceMax: num  1481 1858 1000 1163 1946 ...
##  $ priceMod: num  1256 1613 850 983 1688 ...
df <- df %>%
      mutate(market1 = market) %>%
      separate(market1, c("city", "state"), sep = "\\(")
## Warning: Too many values at 99 locations: 840, 841, 842, 843, 844, 845,
## 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, ...
## Warning: Too few values at 535 locations: 351, 352, 353, 354, 355, 356,
## 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, ...
df$state <- df$state %>% str_replace("\\)","")
df <- df %>%
      mutate(state = ifelse(is.na(state), market, state))
head(df)
##       market    month year quantity priceMin priceMax priceMod   city
## 1 ABOHAR(PB)  January 2014      440     1025     1481     1256 ABOHAR
## 2 ABOHAR(PB)  January 2015     1305     1309     1858     1613 ABOHAR
## 3 ABOHAR(PB)  January 2017      200      750     1000      850 ABOHAR
## 4 ABOHAR(PB) February 2014     1115      831     1163      983 ABOHAR
## 5 ABOHAR(PB) February 2015     1115     1200     1946     1688 ABOHAR
## 6 ABOHAR(PB)    March 2014     1300      844     1181      994 ABOHAR
##   state
## 1    PB
## 2    PB
## 3    PB
## 4    PB
## 5    PB
## 6    PB
df <- df %>%
      mutate(date = paste(month, year, sep="-"))
df$date = as.Date(paste("01-",df$date,sep=""), "%d-%B-%Y")
str(df)
## 'data.frame':    3783 obs. of  10 variables:
##  $ market  : chr  "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
##  $ month   : chr  "January" "January" "January" "February" ...
##  $ year    : num  2014 2015 2017 2014 2015 ...
##  $ quantity: num  440 1305 200 1115 1115 ...
##  $ priceMin: num  1025 1309 750 831 1200 ...
##  $ priceMax: num  1481 1858 1000 1163 1946 ...
##  $ priceMod: num  1256 1613 850 983 1688 ...
##  $ city    : chr  "ABOHAR" "ABOHAR" "ABOHAR" "ABOHAR" ...
##  $ state   : chr  "PB" "PB" "PB" "PB" ...
##  $ date    : Date, format: "2014-01-01" "2015-01-01" ...

Transform

df2016 <- df %>%
          filter(year == 2016)
head(df2016)
##       market     month year quantity priceMin priceMax priceMod   city
## 1 ABOHAR(PB)      June 2016      710      523      755      613 ABOHAR
## 2 ABOHAR(PB)    August 2016       60      650     1065      850 ABOHAR
## 3 ABOHAR(PB) September 2016      400      650      800      725 ABOHAR
## 4 ABOHAR(PB)   October 2016      350      700      900      850 ABOHAR
## 5 ABOHAR(PB)  November 2016      590      775     1059      888 ABOHAR
## 6 ABOHAR(PB)  December 2016      330      688     1100      873 ABOHAR
##   state       date
## 1    PB 2016-06-01
## 2    PB 2016-08-01
## 3    PB 2016-09-01
## 4    PB 2016-10-01
## 5    PB 2016-11-01
## 6    PB 2016-12-01

Split-Apply-Combine

df2016City <- df %>%
              filter(year == 2016) %>%
              group_by(city) %>%
              summarise(quantity_year = sum(quantity)) %>%
              arrange(desc(quantity_year)) %>%
              filter(quantity_year > 2500000)
head(df2016City)
## # A tibble: 6 × 2
##         city quantity_year
##        <chr>         <dbl>
## 1  BANGALORE       9117473
## 2     MAHUVA       6170912
## 3 PIMPALGAON       3836046
## 4    SOLAPUR       3767140
## 5  LASALGAON       3402394
## 6       PUNE       3339194

Explore

ggplot(df2016City) + 
  aes(reorder(city, quantity_year),weight = quantity_year) + 
  geom_bar() + 
  coord_flip()

cities <- unique(df2016City$city)
cities
## [1] "BANGALORE"  "MAHUVA"     "PIMPALGAON" "SOLAPUR"    "LASALGAON" 
## [6] "PUNE"       "DELHI"      "NEWASA"     "MUMBAI"
dfCity <- df %>%
          filter( city %in% cities)
dim(dfCity)  
## [1] 348  10
ggplot(dfCity) + aes(date, priceMod, color=city) + geom_line()

g <- ggplot(dfCity) + aes(date, priceMod, color=city) + geom_line()
ggplotly(g)
dfCityTall <- dfCity %>%
              gather("priceType", "priceValue",5:7) %>%
              arrange(date)
ggplot(dfCityTall) + aes(date, y = priceValue, color = priceType) + geom_line() + facet_wrap(~city)

Model

dfBang <- df %>%
  filter(city == "BANGALORE") %>%
  select(date, priceMod) %>%
  arrange(date)

ggplot(dfBang) + aes(date, priceMod) + geom_line()

colnames(dfBang) <- c('ds', 'y')
str(dfBang)
## 'data.frame':    40 obs. of  2 variables:
##  $ ds: Date, format: "2014-01-01" "2014-02-01" ...
##  $ y : num  1094 797 748 712 941 ...
m <- prophet(dfBang)
## Warning in set_auto_seasonalities(m): Disabling weekly seasonality. Run
## prophet with `weekly.seasonality=TRUE` to override this.
## Initial log joint probability = -3.13771
## Optimization terminated normally: 
##   Convergence detected: absolute parameter change was below tolerance
future <- make_future_dataframe(m, periods = 12)
tail(future)
##            ds
## 47 2017-04-08
## 48 2017-04-09
## 49 2017-04-10
## 50 2017-04-11
## 51 2017-04-12
## 52 2017-04-13
forecast <- predict(m, future)
tail(forecast[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
##            ds     yhat yhat_lower yhat_upper
## 47 2017-04-08 738.8072   144.3805   1377.286
## 48 2017-04-09 754.8498   121.4589   1359.882
## 49 2017-04-10 768.9300   158.8456   1344.891
## 50 2017-04-11 780.7612   156.1110   1436.928
## 51 2017-04-12 790.1263   177.7936   1391.326
## 52 2017-04-13 796.8850   191.5294   1396.898
plot(m, forecast)

prophet_plot_components(m, forecast)

Insight

uniqcity <- unique(dfCity$city)
geo <- geocode(uniqcity)
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=BANGALORE&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=DELHI&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=LASALGAON&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=MAHUVA&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=MUMBAI&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=NEWASA&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=PIMPALGAON&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=PUNE&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=SOLAPUR&sensor=false
dfGeo <- bind_cols(df2016City, geo)
dfGeo
## # A tibble: 9 × 4
##         city quantity_year      lon      lat
##        <chr>         <dbl>    <dbl>    <dbl>
## 1  BANGALORE       9117473 77.59456 12.97160
## 2     MAHUVA       6170912 77.10249 28.70406
## 3 PIMPALGAON       3836046 74.23261 20.14914
## 4    SOLAPUR       3767140 71.77046 21.09216
## 5  LASALGAON       3402394 72.87766 19.07598
## 6       PUNE       3339194 74.92811 19.55118
## 7      DELHI       3061788 73.98738 20.16997
## 8     NEWASA       2897566 73.85674 18.52043
## 9     MUMBAI       2872669 75.90639 17.65992
ggplot(dfGeo) + aes(lon, lat, size=quantity_year/1000) + geom_point() + coord_map()

map <- get_map("India", zoom = 5)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=India&zoom=5&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=India&sensor=false
ggmap(map)

map1 <- get_map("India", maptype = "watercolor", source = "stamen", zoom = 5)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=India&zoom=5&size=640x640&scale=2&maptype=terrain&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=India&sensor=false
## Map from URL : http://tile.stamen.com/watercolor/5/21/12.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/22/12.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/23/12.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/24/12.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/21/13.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/22/13.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/23/13.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/24/13.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/21/14.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/22/14.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/23/14.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/24/14.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/21/15.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/22/15.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/23/15.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/24/15.jpg
ggmap(map1)

ggmap(map1) + geom_point(data = dfGeo,aes(lon,lat,size=quantity_year/1000,color=city))